############################################################
# Summary: This script reproduces primarily the output for the chapter "Exploring Sensitivity to Partisan Differences (External Validation)"

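# Output:
# - Figure 5: Correspondence of aggregated politicians’ shares for each party for "communion"
# - Figure Appendix: Correspondence of aggregated politicians’ shares for each party for "agency"
# - Figure Appendix: Gender differences in individual communion and agency shares
# - Figure Appendix: Distribution of predicted cues by source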


# SVM predictions. The file holds one prediction column per seed
# ("pred_label_seed<seed>"); reshape to long format with one row per
# text x seed and harmonise the columns with the other prediction tables.
df_pred_svm <- read_csv("data/results/unlabaled_prediction_svm.csv") |>
  pivot_longer(
    cols = starts_with("pred_label_seed"),
    names_to = "seed",
    values_to = "pred_label"
  ) |>
  mutate(
    # Strip the column-name prefix so only the seed number remains.
    seed = as.integer(gsub("pred_label_seed", "", seed)),
    variation = dataset,
    # Make seeds consistent: seed 999 is relabelled as 123. Integer literals
    # keep both branches the same type as `seed`, and TRUE (not the
    # reassignable shortcut T) marks the fallback branch.
    seed = case_when(
      seed == 999L ~ 123L,
      TRUE ~ seed
    )
  ) |>
  select(text, pred_label, seed, variation, party, source, gender, method, name)

# Semi-supervised predictions (labelled as method "xlm-roberta-base").
# Speaker metadata (source, party, gender, name) is attached by matching the
# text against the SVM prediction table; texts without a party are dropped.
# NOTE(review): the lookup joins on `text` alone; if the same text occurs with
# more than one metadata combination the join duplicates rows - confirm.
df_pred_semi <- read_csv("data/results/unlabeled_predictions_semisupvervised.csv") |>
  left_join(
    distinct(df_pred_svm, text, source, party, gender, name),
    by = "text"
  ) |>
  filter(dataset == "unlabeled", !is.na(party)) |>
  mutate(
    seed = as.integer(seed),
    # Numeric class ids are mapped to the label names used elsewhere;
    # any other id becomes NA.
    pred_label = case_when(
      pred_seed == 0 ~ "none",
      pred_seed == 1 ~ "emp",
      pred_seed == 2 ~ "dur"
    ),
    method = "xlm-roberta-base"
  ) |>
  select(
    text, pred_label, seed,
    variation = dataset_label,
    party, source, gender, method, name
  )

# DeepSeek prompting predictions (random seed 42 only), renamed to the column
# layout shared by all prediction tables.
df_pred_promp_deepseek <- read_csv("data/results/unlabaled_prediction_prompting_deepseek.csv") |>
  filter(random_seed == 42) |>
  select(
    text,
    pred_label = model_label,
    seed = random_seed,
    variation = shot_type,
    party, source, gender,
    method = model,
    name
  )

# GPT prompting predictions (random seed 42 only), renamed to the column
# layout shared by all prediction tables.
df_pred_promp_gpt <- read_csv("data/results/unlabaled_prediction_prompting_gpt.csv") |>
  filter(random_seed == 42) |>
  # Keep only the part of the model answer before the first comma.
  mutate(pred_label = sub(",.*", "", model_label_gpt_few)) |>
  select(
    text,
    pred_label,
    seed = random_seed,
    variation = shot_type,
    party, source, gender,
    method = model,
    name
  )

# Llama prompting predictions (random seed 42 only), renamed to the column
# layout shared by all prediction tables.
df_pred_promp_llama <- read_csv("data/results/unlabaled_prediction_prompting_llama.csv") |>
  filter(random_seed == 42) |>
  # Keep only the part of the model answer before the first comma.
  mutate(pred_label = sub(",.*", "", model_label_llama_few)) |>
  select(
    text,
    pred_label,
    seed = random_seed,
    variation = shot_type,
    party, source, gender,
    method = model,
    name
  )



# Stack the predictions of all methods into one long table
# (one row per text x method x seed x variation).
df_unlabeled <- list(
  df_pred_promp_gpt,
  df_pred_promp_deepseek,
  df_pred_svm,
  df_pred_promp_llama,
  df_pred_semi
) |>
  bind_rows()



# Share of each predicted label within every method x source combination
# (seed 42; "few" and "combined_balanced" variations only).
df_summary <- df_unlabeled |>
  filter(seed == 42, variation %in% c("few", "combined_balanced")) |>
  group_by(method, pred_label, source) |>
  summarise(count = n(), .groups = "drop") |>
  # Normalise within method x source. Grouping by source alone would pool the
  # counts of all methods into one denominator, so the per-method tables
  # below would not sum to 1 within a source.
  group_by(method, source) |>
  mutate(share = count / sum(count)) |>
  ungroup()

# Drop the "none" label and sort by method, label and descending share.
df_sorted <- df_summary |>
  arrange(method, pred_label, desc(share)) |>
  filter(pred_label != "none")

# Split into a list with one table per method. (The object name says
# "pred_label", but the split variable is `method`.)
tables_by_pred_label <- split(df_sorted, df_sorted$method)

# To view each table, print it:
tables_by_pred_label[["svm"]]               # Table for method "svm"
tables_by_pred_label[["xlm-roberta-base"]]  # Table for method "xlm-roberta-base"
tables_by_pred_label[["llama"]]             # Table for method "llama"
tables_by_pred_label[["deepseak"]]          # Table for method "deepseak"
tables_by_pred_label[["gpt"]]               # Table for method "gpt"

# Share of "none" predictions per method and source (seed 42; "few" and
# "combined_balanced" variations), rounded to two decimals.
none_share_by_source <- df_unlabeled |>
  filter(seed == 42, variation %in% c("few", "combined_balanced")) |>
  group_by(method, pred_label, source) |>
  summarise(overall_countlabel = n(), .groups = "drop") |>
  group_by(source, method) |>
  mutate(all = sum(overall_countlabel)) |>
  ungroup() |>
  mutate(share = overall_countlabel / all) |>
  filter(pred_label == "none") |>
  arrange(method, desc(share)) |>
  select(method, source, share) |>
  mutate(share = round(share, 2))

# Copy the table to the clipboard when one is usable; otherwise print it so
# that a non-interactive run (e.g. Rscript) does not abort here.
if (clipr::clipr_available()) {
  clipr::write_clip(none_share_by_source)
} else {
  print(none_share_by_source)
}



### Visualization

# Party-level share of one predicted label, aggregated over politicians.
#
# For every politician (`name`) the share of cues predicted as `label_input`
# is computed. The result holds, per method x party, the mean and standard
# deviation of these politician-level shares, rounded to three decimals.
#
# The counts are collapsed to one row per politician before aggregating.
# Keeping one row per cue (as a grouped mutate() does) would weight every
# politician by the number of matching cues and distort the sd.
#
# Args:
#   data:          long prediction table with columns party, seed, variation,
#                  method, name and pred_label.
#   label_input:   predicted label to compute shares for (e.g. "emp", "dur").
#   min_cues:      politicians need strictly more cues than this in total.
#   target_method: method whose predictions are used.
# Returns:
#   A table with columns method, party, share_mean, share_sd, sorted by
#   method and descending share_mean. Politicians without any cue of
#   `label_input` do not enter the mean.
party_label_shares <- function(data, label_input, min_cues,
                               target_method = "deepseak") {
  data |>
    filter(
      !is.na(party),
      seed == 42,
      variation %in% c("few", "combined_balanced"),
      method == target_method
    ) |>
    # 1) cues per politician and predicted label
    group_by(method, party, name, pred_label) |>
    summarise(n_labelcues = n(), .groups = "drop") |>
    # 2) total cues per politician
    group_by(method, party, name) |>
    mutate(n_cues = sum(n_labelcues)) |>
    ungroup() |>
    # 3) politician-level share of the requested label
    filter(n_cues > min_cues, pred_label == label_input) |>
    mutate(share_name = n_labelcues / n_cues) |>
    # 4) aggregate over politicians: mean and sd of politician-level shares
    group_by(method, party) |>
    summarise(
      share_mean = mean(share_name),
      share_sd = sd(share_name),
      .groups = "drop"
    ) |>
    arrange(method, desc(share_mean)) |>
    mutate(across(c(share_mean, share_sd), ~ round(.x, 3)))
}

# Communion ("emp"): politicians with more than 500 cues.
shares_emp <- party_label_shares(df_unlabeled, label_input = "emp", min_cues = 500)

# Agency ("dur"): politicians with more than 200 cues.
# NOTE(review): the minimum-cue threshold differs from the communion table
# (200 vs 500) - confirm this is intended.
shares_dur <- party_label_shares(df_unlabeled, label_input = "dur", min_cues = 200)


print(shares_emp)
print(shares_dur)


# Copy the communion table to the clipboard only when one is usable, so a
# non-interactive run does not abort here.
if (clipr::clipr_available()) {
  clipr::write_clip(shares_emp)
}

# Colour per party, keyed by the party label used in the plots.
party_colors <- c(
  LINKE  = "#BE3075", # Left Party (purple-pink)
  GREENS = "#64A12D", # Greens
  SPD    = "#E3000F", # SPD (red)
  CDU    = "#000000", # CDU (black)
  AfD    = "#009EE0", # AfD (light blue)
  FDP    = "#FFED00", # FDP (yellow)
  CSU    = "#008AC5"  # CSU (blue)
)


# Expert left-right placement of the parties (2019 rows of the expert survey
# file, country code 3).
# NOTE(review): country code 3 is presumably Germany - confirm against the
# codebook.
lr_scores <- read_csv("data/parties/chapel.csv") |>
  filter(
    year == 2019,
    country == 3,
    party %in% c("CDU", "SPD", "GRUNEN", "CSU", "FDP", "LINKE", "AfD")
  ) |>
  # Use the same party label for the Greens as the prediction data.
  mutate(party = case_when(
    party == "GRUNEN" ~ "GREENS",
    TRUE ~ party
  )) |>
  dplyr::select(party, lrecon, lrgen, galtan) |>
  rename(overall_ideological_stance = lrgen, lr_val = galtan)



# Merge the party-level communion shares with the expert placements and
# reshape so that each ideology dimension becomes one facet.
df_plot_emp <- shares_emp |>
  # The prediction data spell the party "AFD"; the expert scores and the
  # colour palette use "AfD".
  mutate(party = case_when(
    party == "AFD" ~ "AfD",
    TRUE ~ party
  )) |>
  left_join(lr_scores, by = "party") |>
  pivot_longer(cols = c(lrecon, lr_val), names_to = "ideology", values_to = "value") |>
  mutate(ideology = case_when(
    ideology == "lr_val" ~ "Libertarian-Authoritarian Dimension",
    ideology == "lrecon" ~ "Economic Dimension"
  ))


# Scatter plot with a linear fit per facet. Error bars show mean +/- 1 sd of
# the politician-level shares, clipped to [0, 1].
ggplot(df_plot_emp, aes(x = value, y = share_mean, label = party, color = party)) +
  geom_errorbar(
    aes(
      ymin = pmax(0, share_mean - share_sd),
      ymax = pmin(1, share_mean + share_sd)
    ),
    width = 0
  ) +
  geom_point(size = 5) +
  facet_grid(~ideology) +
  geom_smooth(method = "lm", se = FALSE, color = "grey30", linetype = "dashed") +
  geom_label(nudge_y = 0.015, size = 3) +
  scale_color_manual(values = party_colors) +
  # One x scale only: a later xlim() call replaces an earlier
  # scale_x_continuous() (ggplot2 reports this in a message), so the limits
  # that were actually in effect (1 to 10) are set here together with the
  # breaks. The break at 0 lies outside the limits and is not drawn.
  scale_x_continuous(breaks = c(0, 2.5, 5, 7.5, 10), limits = c(1, 10)) +
  labs(
    x = "Left-Right Placement (2019 Chapel Hill Expert Rating)",
    y = "Share of Communion Cues"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.background = element_rect(fill = "white", color = NA), # no box, just white fill
    panel.border = element_rect(color = "grey85", fill = NA, linewidth = 0.2),
    panel.grid.major = element_line(color = "grey95", linewidth = 0.2),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 12),
    strip.text = element_text(face = "bold", size = 12)
  )

# Figure 5
ggsave("output/Figure_5_leftright_communion.png", height = 5, width = 9)


# Merge the party-level agency shares with the expert placements and reshape
# so that each ideology dimension becomes one facet.
df_plot_dur <- shares_dur |>
  # The prediction data spell the party "AFD"; the expert scores and the
  # colour palette use "AfD".
  mutate(party = case_when(
    party == "AFD" ~ "AfD",
    TRUE ~ party
  )) |>
  left_join(lr_scores, by = "party") |>
  pivot_longer(cols = c(lrecon, lr_val), names_to = "ideology", values_to = "value") |>
  mutate(ideology = case_when(
    ideology == "lr_val" ~ "Libertarian-Authoritarian Dimension",
    ideology == "lrecon" ~ "Economic Dimension"
  ))


# Scatter plot with a linear fit per facet. Error bars show mean +/- 1 sd of
# the politician-level shares, clipped to [0, 1].
ggplot(df_plot_dur, aes(x = value, y = share_mean, label = party, color = party)) +
  geom_errorbar(
    aes(
      ymin = pmax(0, share_mean - share_sd),
      ymax = pmin(1, share_mean + share_sd)
    ),
    width = 0
  ) +
  geom_point(size = 5) +
  facet_grid(~ideology) +
  geom_smooth(method = "lm", se = FALSE, color = "grey30", linetype = "dashed") +
  geom_label(nudge_y = 0.015, size = 3) +
  scale_color_manual(values = party_colors) +
  # One x scale only: a later xlim() call replaces an earlier
  # scale_x_continuous() (ggplot2 reports this in a message), so the limits
  # that were actually in effect (1 to 10) are set here together with the
  # breaks. The break at 0 lies outside the limits and is not drawn.
  scale_x_continuous(breaks = c(0, 2.5, 5, 7.5, 10), limits = c(1, 10)) +
  labs(
    x = "Left-Right Placement (2019 Chapel Hill Expert Rating)",
    y = "Share of Agency Cues"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.background = element_rect(fill = "white", color = NA), # no box, just white fill
    panel.border = element_rect(color = "grey85", fill = NA, linewidth = 0.2),
    panel.grid.major = element_line(color = "grey95", linewidth = 0.2),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 12),
    strip.text = element_text(face = "bold", size = 12)
  )

# Figure Appendix (agency)
ggsave("output/Figure_Appendix_leftright_agency.png", height = 5, width = 9)

### Count of Politicians for Interview data
# Number of rows per politician in the interview source, largest first
# (method "deepseak", seed 42, "few"/"combined_balanced" variations).
df_unlabeled |>
  filter(
    seed == 42,
    variation %in% c("few", "combined_balanced"),
    method == "deepseak",
    source == "interviews"
  ) |>
  group_by(name) |>
  summarise(n = n(), .groups = "drop") |>
  arrange(desc(n))


# ---- DATA PREP ----
# Politician-level label shares for every method: for each
# method x politician, the share of each predicted label among all of that
# politician's predictions (seed 42; "few"/"combined_balanced" variations).
# `shares_emp` and `shares_dur` are overwritten here with these
# politician-level tables.
name_level_shares <- df_unlabeled |>
  filter(
    !is.na(party),
    seed == 42,
    variation %in% c("few", "combined_balanced")
  ) |>
  # Capitalise gender for the axis labels; other values become NA.
  mutate(gender = case_when(
    gender == "female" ~ "Female",
    gender == "male" ~ "Male"
  )) |>
  group_by(method, pred_label, name, gender, party) |>
  summarise(overall_countlabel = n(), .groups = "drop") |>
  group_by(name, party, method) |>
  mutate(all = sum(overall_countlabel)) |>
  ungroup() |>
  mutate(share = overall_countlabel / all)

# Communion
shares_emp <- name_level_shares |>
  filter(pred_label == "emp") |>
  select(method, name, party, gender, share) |>
  mutate(Share = round(share, 2))

# Agency
shares_dur <- name_level_shares |>
  filter(pred_label == "dur") |>
  select(method, name, party, gender, share) |>
  mutate(Share = round(share, 2))


# ---- JITTER + OUTLIER LABELING ----
set.seed(123) # makes the random horizontal jitter reproducible

# Prepare politician-level shares for the gender box plots.
#
# Keeps the rows of one method, adds a jittered x position per point and flags
# the points to be labelled as outliers within each gender.
#
# Args:
#   data:        table with columns method, gender and share.
#   method_name: method whose rows are kept.
#   iqr_mult:    a point is flagged when it lies more than `iqr_mult` * IQR
#                below the first or above the third quartile of its gender.
# Returns:
#   The filtered table with added columns gender_num (position of the gender
#   among the sorted gender values), x_jittered (gender_num plus uniform noise
#   in [-0.2, 0.2]), q1, q3, iqr and the logical flag label_me.
prepare_jittered_data <- function(data, method_name = "deepseak", iqr_mult = 0.3) {
  data |>
    filter(method == method_name) |>
    mutate(
      gender_num = as.numeric(factor(gender)),
      x_jittered = gender_num + runif(n(), -0.2, 0.2)
    ) |>
    group_by(gender) |>
    mutate(
      q1 = quantile(share, 0.25),
      q3 = quantile(share, 0.75),
      iqr = q3 - q1,
      label_me = share < (q1 - iqr_mult * iqr) | share > (q3 + iqr_mult * iqr)
    ) |>
    ungroup()
}

# Party order for the colour legend. The labels must match the names of
# `party_colors`, which spells the party "AfD".
desired_order <- c("LINKE", "GREENS", "SPD", "FDP", "CDU", "CSU", "AfD")

shares_jittered_emp <- prepare_jittered_data(shares_emp)
shares_jittered_dur <- prepare_jittered_data(shares_dur)

# The prediction data spell the party "AFD". Recode it to "AfD" before
# building the factor so the points are matched by the manual colour scale
# instead of falling back to the NA colour.
shares_jittered_emp$party <- as.character(shares_jittered_emp$party)
shares_jittered_emp$party[shares_jittered_emp$party %in% "AFD"] <- "AfD"
shares_jittered_emp$party <- factor(shares_jittered_emp$party, levels = desired_order)

shares_jittered_dur$party <- as.character(shares_jittered_dur$party)
shares_jittered_dur$party[shares_jittered_dur$party %in% "AFD"] <- "AfD"
shares_jittered_dur$party <- factor(shares_jittered_dur$party, levels = desired_order)

# Shared theme for the politician-level plots: theme_minimal() plus boxed
# panels and facet strips, bold 12 pt legend title and strip text, a centred
# bold plot title and no x-axis title.
#
# Args:
#   base_size, base_family: passed on to theme_minimal().
theme_individual <- function(base_size = 12, base_family = "") {
  overrides <- theme(
    # text
    plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
    axis.title.x = element_blank(),
    axis.text = element_text(size = 11),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 12),
    strip.text = element_text(face = "bold", size = 12),
    # frames
    strip.background = element_rect(fill = "white", color = "black", linewidth = 0.5),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 0.4),
    # spacing
    plot.margin = margin(t = 20, r = 10, b = 10, l = 10)
  )

  theme_minimal(base_size = base_size, base_family = base_family) + overrides
}




# ---- PLOT 1: Communion ----
# Box plot of politician-level communion shares by gender, overlaid with the
# jittered individual points (coloured by party); flagged points are labelled
# with the politician's name.
p1 <- ggplot(shares_jittered_emp, aes(x = gender, y = Share)) +
  geom_boxplot(
    outlier.shape = NA, # individual points are drawn by the next layer
    fill = "gray80",
    color = "gray30",
    linewidth = 0.5
  ) +
  geom_point(
    aes(x = x_jittered, color = party),
    size = 2.5,
    alpha = 0.6
  ) +
  geom_label_repel(
    data = shares_jittered_emp |> filter(label_me),
    mapping = aes(x = x_jittered, y = Share, label = name, color = party),
    size = 3.5,
    box.padding = 0.3,
    point.padding = 0.2,
    segment.alpha = 0.3,
    show.legend = FALSE,
    max.overlaps = Inf
  ) +
  scale_color_manual(values = party_colors, name = "Party Affiliation") +
  theme_individual() +
  labs(title = "Individual Communion Shares") +
  guides(color = guide_legend(nrow = 1))



# ---- PLOT 2: Agency ----
# Box plot of politician-level agency shares by gender, overlaid with the
# jittered individual points (coloured by party); flagged points are labelled
# with the politician's name.
p2 <- ggplot(shares_jittered_dur, aes(x = gender, y = Share)) +
  geom_boxplot(
    outlier.shape = NA, # individual points are drawn by the next layer
    fill = "gray80",
    color = "gray30",
    linewidth = 0.5
  ) +
  geom_point(
    aes(x = x_jittered, color = party),
    size = 2.5,
    alpha = 0.6
  ) +
  geom_label_repel(
    data = shares_jittered_dur |> filter(label_me),
    mapping = aes(x = x_jittered, y = Share, label = name, color = party),
    size = 3.5,
    box.padding = 0.3,
    point.padding = 0.2,
    segment.alpha = 0.3,
    show.legend = FALSE,
    max.overlaps = Inf
  ) +
  scale_color_manual(values = party_colors, name = "Party Affiliation") +
  theme_individual() +
  labs(title = "Individual Agency Shares") +
  guides(color = guide_legend(nrow = 1))


# ---- Combine Plots ----
# Place the communion and agency plots side by side with one shared legend.
p3 <- ggpubr::ggarrange(p1, p2, nrow = 1, common.legend = TRUE, legend = "bottom")

# Namespace-qualified like ggarrange() above, so the call does not depend on
# ggpubr being attached. No overall title is added; the previous title was:
#   top = text_grob("Gender Differences in Agency and Communion Scores",
#                   color = "black", face = "bold", size = 16)
fig_gender <- ggpubr::annotate_figure(p3)
fig_gender

# Pass the figure explicitly instead of relying on ggplot2's last_plot().
ggsave("output/Figure_Appendix_gender_differences.png", plot = fig_gender, width = 10, height = 7)

# Share of each predicted label per source (method "deepseak", seed 42,
# "few"/"combined_balanced" variations) with a normal-approximation 95%
# interval for a proportion.
# NOTE(review): the denominator 15000 is hard-coded - presumably the number of
# texts per source; confirm against the data.
df_bar_errors <- df_unlabeled |>
  filter(
    seed == 42,
    variation %in% c("few", "combined_balanced"),
    method == "deepseak",
    pred_label %in% c("none", "emp", "dur")
  ) |>
  group_by(pred_label, source) |>
  summarise(n = n(), .groups = "drop") |>
  mutate(
    p = n / 15000,
    se = sqrt(p * (1 - p) / 15000),
    lower = p - 1.96 * se,
    upper = p + 1.96 * se,
    # Capitalize source names for the axis labels
    source = stringr::str_to_title(source)
  )

# Plot: share of communion and agency predictions per source, one facet per
# label, with the interval from `df_bar_errors` as error bars. The "none"
# label is not shown. No title is set.
df_bar_errors |>
  filter(pred_label != "none") |>
  ggplot(aes(x = source, y = p, fill = pred_label)) +
  geom_col(position = "dodge", width = 0.7, alpha = 0.9) +
  geom_errorbar(
    aes(ymin = lower, ymax = upper),
    position = position_dodge(width = 0.7),
    width = 0.2,
    color = "black"
  ) +
  facet_grid(
    ~pred_label,
    labeller = as_labeller(c(emp = "Communion", dur = "Agency"))
  ) +
  scale_fill_manual(values = c("emp" = "#1f77b4", "dur" = "#ff7f0e")) +
  labs(x = "Source", y = "Share of Predicted Cues") +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.background = element_rect(fill = "white", color = "black", linewidth = 0.5),
    strip.text = element_text(face = "bold", size = 12),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 0.4),
    axis.title = element_text(face = "bold"),
    plot.title = element_blank() # ensure title is removed
  )

ggsave("output/Figure_Appendix_source_distribution.png", height = 5, width = 9)
